/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.analysis;

import java.io.*;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Token;

/**
 * The tokenizer used for Nutch document text.  Implemented in terms of our
 * JavaCC-generated lexical analyzer, {@link NutchAnalysisTokenManager}, shared
 * with the query parser.
 *
 * <p>Only end-of-file, word, acronym and sigram tokens are surfaced; every
 * other token kind produced by the shared lexer (query syntax) is skipped.
 */
public final class NutchDocumentTokenizer extends Tokenizer
  implements NutchAnalysisConstants {

  /** Lexer that produces the raw Nutch tokens from the reader. */
  private final NutchAnalysisTokenManager tokenManager;

  /**
   * Construct a tokenizer for the text in a Reader.
   *
   * @param reader source of the document text; handed both to the Lucene
   *               superclass and to the token manager
   */
  public NutchDocumentTokenizer(Reader reader) {
    super(reader);
    tokenManager = new NutchAnalysisTokenManager(reader);
  }

  /**
   * Returns the next token in the stream, or null at EOF.
   *
   * @return the next document token translated to a Lucene token, or
   *         <code>null</code> once the token manager reports EOF
   * @throws IOException if the token manager raises a {@link TokenMgrError};
   *         the original error is attached as the cause
   */
  public final Token next() throws IOException {
    net.nutch.analysis.Token t;

    try {
      // Skip query syntax tokens until a document token (or EOF) shows up.
      do {
        t = tokenManager.getNextToken();
      } while (!isDocumentToken(t.kind));
    } catch (TokenMgrError e) {
      // Translate lexer errors, keeping the original error as the cause so
      // its stack trace is not lost.
      IOException translated = new IOException("Tokenizer error:" + e);
      translated.initCause(e);
      throw translated;
    }

    if (t.kind == EOF) {
      return null;
    }

    // Translate the Nutch token into a Lucene token: the lexer's begin/end
    // columns are passed as the start/end arguments, and the token kind's
    // image as the type.
    return new Token(t.image, t.beginColumn, t.endColumn, tokenImage[t.kind]);
  }

  /** Tells whether a token kind is one this tokenizer returns to callers. */
  private static boolean isDocumentToken(int kind) {
    switch (kind) {
    case EOF:
    case WORD:
    case ACRONYM:
    case SIGRAM:
      return true;
    default:
      return false;
    }
  }

  /**
   * For debugging.  Reads lines from standard input and prints the tokens
   * found in each one, until standard input is exhausted.
   */
  public static void main(String[] args) throws Exception {
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    while (true) {
      System.out.print("Text: ");
      String line = in.readLine();
      if (line == null) {
        // End of input: StringReader would reject a null string.
        System.out.println();
        break;
      }

      Tokenizer tokenizer = new NutchDocumentTokenizer(new StringReader(line));
      Token token;
      System.out.print("Tokens: ");
      while ((token = tokenizer.next()) != null) {
        System.out.print(token.termText());
        System.out.print(" ");
      }
      System.out.println();
    }
  }
}